Decision Tree

Step 1: Load the Libraries
In [1]:
# Silence all warnings so library deprecation notices don't clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Display the result of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [3]:
# Core analysis stack: numerics, dataframes, and plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook.
%matplotlib inline

# Global plot styling.
plt.style.use('fast')
sns.set_style('darkgrid')
Step 2: Load the data
In [4]:
Loan_train = pd.read_csv(r'C:\Users\amaresh.murthiraju\Documents\Course Material\ML\06 Decision Tree\Loan Prediction use case\LoanPred_train.csv')
Step 3: Investigate the Data Frame
In [5]:
Loan_train.head()
Out[5]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
0 LP001002 Male No 0 Graduate No 5849 0.0 NaN 360.0 1.0 Urban Y
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.0 360.0 1.0 Rural N
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.0 360.0 1.0 Urban Y
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.0 360.0 1.0 Urban Y
4 LP001008 Male No 0 Graduate No 6000 0.0 141.0 360.0 1.0 Urban Y
In [6]:
Loan_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
In [7]:
Loan_train.describe()
Out[7]:
ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
count 614.000000 614.000000 592.000000 600.00000 564.000000
mean 5403.459283 1621.245798 146.412162 342.00000 0.842199
std 6109.041673 2926.248369 85.587325 65.12041 0.364878
min 150.000000 0.000000 9.000000 12.00000 0.000000
25% 2877.500000 0.000000 100.000000 360.00000 1.000000
50% 3812.500000 1188.500000 128.000000 360.00000 1.000000
75% 5795.000000 2297.250000 168.000000 360.00000 1.000000
max 81000.000000 41667.000000 700.000000 480.00000 1.000000
In [9]:
# Check for missing values

Loan_train.isnull().sum()
Out[9]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
In [10]:
sns.pairplot(Loan_train)
Out[10]:
<seaborn.axisgrid.PairGrid at 0x21498207548>
Step 4: Numerical variable analysis
In [ ]:
# ApplicantIncome      614 non-null int64
# CoapplicantIncome    614 non-null float64
# LoanAmount           592 non-null float64
In [17]:
# Distribution and outlier view of the two income columns.
# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(kde=True) is the supported equivalent, and boxplot now takes
# its data by keyword rather than positionally.
plt.figure(figsize=(20,10))

plt.subplot(221)
sns.histplot(Loan_train.ApplicantIncome, kde=True)

plt.subplot(222)
sns.histplot(Loan_train.CoapplicantIncome, kde=True)

plt.subplot(223)
sns.boxplot(x=Loan_train.ApplicantIncome)

plt.subplot(224)
sns.boxplot(x=Loan_train.CoapplicantIncome)

plt.show();
In [24]:
# LoanAmount still has 22 missing values at this point, so drop them
# before plotting. dropna() is clearer than the original boolean-mask
# indexing (Loan_train[Loan_train.notnull()['LoanAmount']]['LoanAmount']).
# histplot(kde=True) replaces the deprecated sns.distplot.
plt.figure(figsize=(20,5))

plt.subplot(121)
sns.histplot(Loan_train['LoanAmount'].dropna(), kde=True)

plt.subplot(122)
sns.boxplot(x=Loan_train.LoanAmount)

plt.show();
Handle missing values in numerical columns
In [25]:
Loan_train.LoanAmount.mean()
Out[25]:
146.41216216216216
In [ ]:
# Gender	Married	Dependents	Education	Self_Employed
In [30]:
Loan_train.Dependents.unique()
Out[30]:
array(['0', '1', '2', '3+', nan], dtype=object)
In [34]:
# LoanAmount spread broken out by each categorical column — used to judge
# whether a single overall mean is a reasonable imputation across groups.
plt.figure(figsize=(20,15))

for plot_pos, cat_col in enumerate(['Gender', 'Married', 'Education', 'Self_Employed'], start=1):
    plt.subplot(2, 2, plot_pos)
    sns.boxplot(x=Loan_train[cat_col], y=Loan_train.LoanAmount)

plt.show();
In [36]:
Loan_train['LoanAmount'].fillna(Loan_train.LoanAmount.mean(),inplace = True)
In [43]:
# Check the LoanAmount distribution after filling the missing values
# (expect a spike at the imputed mean). histplot(kde=True) replaces the
# deprecated sns.distplot.
plt.figure(dpi = 100)
sns.histplot(Loan_train.LoanAmount, kde=True);
In [44]:
# Identifying outliers

# 1. Based on IQR
#         i. Values less than Q1 - 1.5*IQR
#         ii. Values greater than Q3 + 1.5*IQR


# 2. Values beyond mean +/- 2SD
In [46]:
np.percentile(Loan_train.LoanAmount, [25,75])
Out[46]:
array([100.25, 164.75])
In [47]:
Q1, Q3 = np.percentile(Loan_train.LoanAmount, [25,75])
In [49]:
Q1; Q3
Out[49]:
100.25
Out[49]:
164.75
In [51]:
IQR = Q3 - Q1; IQR
Out[51]:
64.5
In [52]:
LL = Q1 - 1.5*IQR
UL = Q3 + 1.5*IQR
In [53]:
LL; UL
Out[53]:
3.5
Out[53]:
261.5
In [ ]:
# mean +/- 2SD
In [60]:
Loan_train.LoanAmount.std()
Out[60]:
84.0374676831965
In [61]:
Loan_train.LoanAmount.mean() + 2*Loan_train.LoanAmount.std()
Out[61]:
314.4870975285551
In [62]:
Loan_train.LoanAmount.mean() - 2*Loan_train.LoanAmount.std()
Out[62]:
-21.66277320423086
In [57]:
Loan_train[(Loan_train.LoanAmount < 3.5) | (Loan_train.LoanAmount > 261.5)]
Out[57]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status
5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.0 360.0 1.0 Urban Y
9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.0 360.0 1.0 Semiurban N
21 LP001046 Male Yes 1 Graduate No 5955 5625.0 315.0 360.0 1.0 Urban Y
34 LP001100 Male No 3+ Graduate No 12500 3000.0 320.0 360.0 1.0 Rural N
54 LP001186 Female Yes 1 Graduate Yes 11500 0.0 286.0 360.0 0.0 Urban N
67 LP001233 Male Yes 1 Graduate No 10750 0.0 312.0 360.0 1.0 Urban Y
83 LP001273 Male Yes 0 Graduate No 6000 2250.0 265.0 360.0 NaN Semiurban N
126 LP001448 NaN Yes 3+ Graduate No 23803 0.0 370.0 360.0 1.0 Rural Y
130 LP001469 Male No 0 Graduate Yes 20166 0.0 650.0 480.0 NaN Urban Y
135 LP001488 Male Yes 3+ Graduate No 4000 7750.0 290.0 360.0 1.0 Semiurban N
155 LP001536 Male Yes 3+ Graduate No 39999 0.0 600.0 180.0 0.0 Semiurban Y
161 LP001562 Male Yes 0 Graduate No 7933 0.0 275.0 360.0 1.0 Urban N
171 LP001585 NaN Yes 3+ Graduate No 51763 0.0 700.0 300.0 1.0 Urban Y
177 LP001610 Male Yes 3+ Graduate No 5516 11300.0 495.0 360.0 0.0 Semiurban N
233 LP001776 Female No 0 Graduate No 8333 0.0 280.0 360.0 1.0 Semiurban Y
253 LP001843 Male Yes 1 Not Graduate No 2661 7101.0 279.0 180.0 1.0 Semiurban Y
258 LP001859 Male Yes 0 Graduate No 14683 2100.0 304.0 360.0 1.0 Rural N
260 LP001865 Male Yes 1 Graduate No 6083 4250.0 330.0 360.0 NaN Urban Y
278 LP001907 Male Yes 0 Graduate No 14583 0.0 436.0 360.0 1.0 Semiurban Y
308 LP001996 Male No 0 Graduate No 20233 0.0 480.0 360.0 1.0 Rural N
324 LP002065 Male Yes 3+ Graduate No 15000 0.0 300.0 360.0 1.0 Rural Y
325 LP002067 Male Yes 1 Graduate Yes 8666 4983.0 376.0 360.0 0.0 Rural N
333 LP002101 Male Yes 0 Graduate NaN 63337 0.0 490.0 180.0 1.0 Urban Y
351 LP002140 Male No 0 Graduate No 8750 4167.0 308.0 360.0 1.0 Rural N
369 LP002191 Male Yes 0 Graduate No 19730 5266.0 570.0 360.0 1.0 Rural N
372 LP002201 Male Yes 2 Graduate Yes 9323 7873.0 380.0 300.0 1.0 Rural Y
381 LP002229 Male No 0 Graduate No 5941 4232.0 296.0 360.0 1.0 Semiurban Y
391 LP002262 Male Yes 3+ Graduate No 9504 0.0 275.0 360.0 1.0 Rural Y
409 LP002317 Male Yes 3+ Graduate No 81000 0.0 360.0 360.0 0.0 Rural N
432 LP002386 Male No 0 Graduate NaN 12876 0.0 405.0 360.0 1.0 Semiurban Y
487 LP002547 Male Yes 1 Graduate No 18333 0.0 500.0 360.0 1.0 Urban N
506 LP002624 Male Yes 0 Graduate No 20833 6667.0 480.0 360.0 NaN Urban Y
514 LP002652 Male No 0 Graduate No 5815 3666.0 311.0 360.0 1.0 Rural N
523 LP002693 Male Yes 2 Graduate Yes 7948 7166.0 480.0 360.0 1.0 Rural Y
525 LP002699 Male Yes 2 Graduate Yes 17500 0.0 400.0 360.0 1.0 Rural Y
536 LP002734 Male Yes 0 Graduate No 6133 3906.0 324.0 360.0 1.0 Urban Y
561 LP002813 Female Yes 1 Graduate Yes 19484 0.0 600.0 360.0 1.0 Semiurban Y
572 LP002855 Male Yes 2 Graduate No 16666 0.0 275.0 360.0 1.0 Urban Y
592 LP002933 NaN No 3+ Graduate Yes 9357 0.0 292.0 360.0 1.0 Semiurban Y
600 LP002949 Female No 3+ Graduate NaN 416 41667.0 350.0 180.0 NaN Urban N
604 LP002959 Female Yes 1 Graduate No 12000 0.0 496.0 360.0 1.0 Semiurban Y
In [67]:
# Transforming the LoanAmount column — apply log to reduce the strong
# right skew visible in the raw distribution.
# histplot(kde=True) replaces the deprecated sns.distplot.
plt.figure(figsize=(20,7))

plt.subplot(121)
sns.histplot(Loan_train.LoanAmount, kde=True)

plt.subplot(122)
sns.histplot(np.log(Loan_train.LoanAmount), kde=True)

plt.show();
In [68]:
Loan_train['LoanAmount_log'] = np.log(Loan_train.LoanAmount)
In [69]:
# Applicant Income & Coapplicant Income
In [70]:
Loan_train['TotalIncome'] = Loan_train.ApplicantIncome + Loan_train.CoapplicantIncome
In [71]:
# TotalIncome is heavily right-skewed; the log transform makes it
# closer to normal. histplot(kde=True) replaces the deprecated distplot.
plt.figure(figsize=(20,7))

plt.subplot(121)
sns.histplot(Loan_train.TotalIncome, kde=True)

plt.subplot(122)
sns.histplot(np.log(Loan_train.TotalIncome), kde=True)

plt.show();
In [72]:
Loan_train['TotalIncome_log'] = np.log(Loan_train.TotalIncome)
In [73]:
# DataFrame after numerical variable analysis: LoanAmount imputed, and the
# LoanAmount_log / TotalIncome / TotalIncome_log columns added.

Loan_train.head(10)
Out[73]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status LoanAmount_log TotalIncome TotalIncome_log
0 LP001002 Male No 0 Graduate No 5849 0.0 146.412162 360.0 1.0 Urban Y 4.986426 5849.0 8.674026
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.000000 360.0 1.0 Rural N 4.852030 6091.0 8.714568
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.000000 360.0 1.0 Urban Y 4.189655 3000.0 8.006368
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.000000 360.0 1.0 Urban Y 4.787492 4941.0 8.505323
4 LP001008 Male No 0 Graduate No 6000 0.0 141.000000 360.0 1.0 Urban Y 4.948760 6000.0 8.699515
5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.000000 360.0 1.0 Urban Y 5.587249 9613.0 9.170872
6 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.000000 360.0 1.0 Urban Y 4.553877 3849.0 8.255569
7 LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.000000 360.0 0.0 Semiurban N 5.062595 5540.0 8.619750
8 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.000000 360.0 1.0 Urban Y 5.123964 5532.0 8.618305
9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.000000 360.0 1.0 Semiurban N 5.855072 23809.0 10.077819

Step 5: Categorical variable analysis

In [79]:
Loan_train.dtypes.index
Out[79]:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'LoanAmount_log', 'TotalIncome', 'TotalIncome_log'],
      dtype='object')
In [84]:
Loan_train.dtypes.values
Out[84]:
array([dtype('O'), dtype('O'), dtype('O'), dtype('O'), dtype('O'),
       dtype('O'), dtype('int64'), dtype('float64'), dtype('float64'),
       dtype('float64'), dtype('float64'), dtype('O'), dtype('O'),
       dtype('float64'), dtype('float64'), dtype('float64')], dtype=object)
In [92]:
# Category frequencies for each discrete column. display() keeps every
# result visible even though the expressions now sit inside a loop
# (bare expressions inside a loop body are not auto-displayed).
for cat_col in ['Gender', 'Married', 'Dependents', 'Education',
                'Self_Employed', 'Credit_History', 'Property_Area',
                'Loan_Status']:
    display(Loan_train[cat_col].value_counts())
Out[92]:
Male      489
Female    112
Name: Gender, dtype: int64
Out[92]:
Yes    398
No     213
Name: Married, dtype: int64
Out[92]:
0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64
Out[92]:
Graduate        480
Not Graduate    134
Name: Education, dtype: int64
Out[92]:
No     500
Yes     82
Name: Self_Employed, dtype: int64
Out[92]:
1.0    475
0.0     89
Name: Credit_History, dtype: int64
Out[92]:
Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64
Out[92]:
Y    422
N    192
Name: Loan_Status, dtype: int64
In [89]:
Loan_train.Dependents.value_counts().plot(kind='bar');
In [93]:
pd.crosstab(Loan_train.Education, Loan_train.Gender)
Out[93]:
Gender Female Male
Education
Graduate 92 376
Not Graduate 20 113
In [95]:
pd.crosstab(Loan_train.Education, Loan_train.Gender).plot(kind = 'bar');
In [98]:
pd.crosstab(Loan_train.Education, Loan_train.Gender).plot(kind = 'bar', stacked = True, grid = False);
In [99]:
pd.crosstab([Loan_train.Education, Loan_train.Gender], Loan_train.Loan_Status)
Out[99]:
Loan_Status N Y
Education Gender
Graduate Female 31 61
Male 105 271
Not Graduate Female 6 14
Male 45 68
In [102]:
pd.crosstab([Loan_train.Education, Loan_train.Gender], Loan_train.Loan_Status).plot(kind = 'bar');
In [103]:
pd.crosstab([Loan_train.Education, Loan_train.Gender], Loan_train.Loan_Status).plot(kind = 'bar', stacked = True);
In [108]:
# Handle missing values

Loan_train.isna().sum()
Out[108]:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
LoanAmount_log        0
TotalIncome           0
TotalIncome_log       0
dtype: int64
In [109]:
missing_cat_cols = ['Gender', 'Married','Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']
In [110]:
Loan_train.Gender.value_counts()
Out[110]:
Male      489
Female    112
Name: Gender, dtype: int64
In [113]:
Loan_train.Gender.mode()[0]
Out[113]:
'Male'
In [114]:
# Impute each remaining categorical/discrete column with its mode
# (most frequent value). Assignment instead of inplace=True avoids the
# deprecated inplace pathway and makes the mutation explicit.
for col_name in missing_cat_cols:
    Loan_train[col_name] = Loan_train[col_name].fillna(Loan_train[col_name].mode()[0])
In [115]:
Loan_train.isna().sum()
Out[115]:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
LoanAmount_log       0
TotalIncome          0
TotalIncome_log      0
dtype: int64
In [118]:
Loan_train.head(10)
Out[118]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status LoanAmount_log TotalIncome TotalIncome_log
0 LP001002 Male No 0 Graduate No 5849 0.0 146.412162 360.0 1.0 Urban Y 4.986426 5849.0 8.674026
1 LP001003 Male Yes 1 Graduate No 4583 1508.0 128.000000 360.0 1.0 Rural N 4.852030 6091.0 8.714568
2 LP001005 Male Yes 0 Graduate Yes 3000 0.0 66.000000 360.0 1.0 Urban Y 4.189655 3000.0 8.006368
3 LP001006 Male Yes 0 Not Graduate No 2583 2358.0 120.000000 360.0 1.0 Urban Y 4.787492 4941.0 8.505323
4 LP001008 Male No 0 Graduate No 6000 0.0 141.000000 360.0 1.0 Urban Y 4.948760 6000.0 8.699515
5 LP001011 Male Yes 2 Graduate Yes 5417 4196.0 267.000000 360.0 1.0 Urban Y 5.587249 9613.0 9.170872
6 LP001013 Male Yes 0 Not Graduate No 2333 1516.0 95.000000 360.0 1.0 Urban Y 4.553877 3849.0 8.255569
7 LP001014 Male Yes 3+ Graduate No 3036 2504.0 158.000000 360.0 0.0 Semiurban N 5.062595 5540.0 8.619750
8 LP001018 Male Yes 2 Graduate No 4006 1526.0 168.000000 360.0 1.0 Urban Y 5.123964 5532.0 8.618305
9 LP001020 Male Yes 1 Graduate No 12841 10968.0 349.000000 360.0 1.0 Semiurban N 5.855072 23809.0 10.077819
In [120]:
Loan_train_bak = Loan_train.copy()
In [121]:
# Encode the Categorical columns

encode_cols = ['Gender', 'Married','Dependents', 'Education', 'Self_Employed', 'Property_Area','Loan_Status' ]
In [122]:
from sklearn.preprocessing import LabelEncoder

# Fit a fresh encoder per column and keep each one, so every column's
# category -> integer mapping can be inverted later (a single shared
# encoder would retain only the last column's classes_).
label_encoders = {}
for x in encode_cols:
    le = LabelEncoder()
    Loan_train[x] = le.fit_transform(Loan_train[x])
    label_encoders[x] = le
In [123]:
# dataframe after Categorical analysis

Loan_train.head(10)
Out[123]:
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area Loan_Status LoanAmount_log TotalIncome TotalIncome_log
0 LP001002 1 0 0 0 0 5849 0.0 146.412162 360.0 1.0 2 1 4.986426 5849.0 8.674026
1 LP001003 1 1 1 0 0 4583 1508.0 128.000000 360.0 1.0 0 0 4.852030 6091.0 8.714568
2 LP001005 1 1 0 0 1 3000 0.0 66.000000 360.0 1.0 2 1 4.189655 3000.0 8.006368
3 LP001006 1 1 0 1 0 2583 2358.0 120.000000 360.0 1.0 2 1 4.787492 4941.0 8.505323
4 LP001008 1 0 0 0 0 6000 0.0 141.000000 360.0 1.0 2 1 4.948760 6000.0 8.699515
5 LP001011 1 1 2 0 1 5417 4196.0 267.000000 360.0 1.0 2 1 5.587249 9613.0 9.170872
6 LP001013 1 1 0 1 0 2333 1516.0 95.000000 360.0 1.0 2 1 4.553877 3849.0 8.255569
7 LP001014 1 1 3 0 0 3036 2504.0 158.000000 360.0 0.0 1 0 5.062595 5540.0 8.619750
8 LP001018 1 1 2 0 0 4006 1526.0 168.000000 360.0 1.0 2 1 5.123964 5532.0 8.618305
9 LP001020 1 1 1 0 0 12841 10968.0 349.000000 360.0 1.0 1 0 5.855072 23809.0 10.077819

Step 6: Model Building

In [135]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
In [125]:
Loan_train.columns
Out[125]:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'LoanAmount_log', 'TotalIncome', 'TotalIncome_log'],
      dtype='object')
In [126]:
feature_cols = ['Gender', 'Married', 'Dependents', 'Education',
                   'Self_Employed','Loan_Amount_Term', 'Credit_History', 
                    'Property_Area','LoanAmount_log','TotalIncome_log']
In [129]:
X = Loan_train[feature_cols]
y = Loan_train.Loan_Status
In [131]:
DT_model = DecisionTreeClassifier()
In [132]:
DT_model.fit(X,y)
Out[132]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
In [133]:
pred = DT_model.predict(X)
In [134]:
accuracy_score(y, pred)
Out[134]:
1.0
In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)
In [139]:
X_train.shape
X_test.shape
Out[139]:
(521, 10)
Out[139]:
(93, 10)
In [140]:
DT_model1 = DecisionTreeClassifier()
In [141]:
DT_model1.fit(X_train, y_train)
Out[141]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
In [143]:
pred_X_train = DT_model1.predict(X_train)
In [144]:
accuracy_score(y_train, pred_X_train)
Out[144]:
1.0
In [145]:
pred_X_test = DT_model1.predict(X_test)
In [146]:
accuracy_score(y_test, pred_X_test)
Out[146]:
0.6881720430107527

Step 7: Model tuning - Feature selection

In [147]:
# RFE (Recursive Feature Elimination): repeatedly fits the estimator and
# prunes the weakest feature until only n_features_to_select remain.
from sklearn.feature_selection import RFE

feature_cols = ['Gender', 'Married', 'Dependents', 'Education',
                   'Self_Employed','Loan_Amount_Term', 'Credit_History', 
                    'Property_Area','LoanAmount_log','TotalIncome_log']

X = Loan_train[feature_cols]
y = Loan_train.Loan_Status

model = DecisionTreeClassifier()

# Pass n_features_to_select by keyword: the positional form was deprecated
# in scikit-learn 0.24 and later removed.
rfe = RFE(model, n_features_to_select=4)

fit = rfe.fit(X, y)
In [148]:
print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)
4
[False False  True False False False  True False  True  True]
[4 7 1 6 5 3 1 2 1 1]
In [149]:
# Print the names of the features RFE kept (ranking 1 == selected).
for feature_name, rank in zip(X.columns, fit.ranking_):
    if rank == 1:
        print(feature_name)
Dependents
Credit_History
LoanAmount_log
TotalIncome_log
In [150]:
# Create a new DT model with selected features

feature_cols = ['Dependents','Credit_History', 'LoanAmount_log','TotalIncome_log']

X = Loan_train[feature_cols]
y = Loan_train.Loan_Status
In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 0)
In [152]:
X_train.shape
X_test.shape
Out[152]:
(521, 4)
Out[152]:
(93, 4)
In [153]:
DT_model2 = DecisionTreeClassifier()
In [154]:
DT_model2.fit(X_train, y_train)
Out[154]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
In [155]:
pred_X_train = DT_model2.predict(X_train)
In [156]:
accuracy_score(y_train, pred_X_train)
Out[156]:
1.0
In [157]:
pred_X_test = DT_model2.predict(X_test)
In [158]:
accuracy_score(y_test, pred_X_test)
Out[158]:
0.6344086021505376
In [ ]:
 

Generate the Decision Tree

You need to install pydotplus and graphviz. These can be installed with your package manager and pip. Graphviz is a tool for drawing graphs described in DOT files, and pydotplus is a Python interface to Graphviz's DOT language.

In [159]:
# sklearn.externals.six was removed in scikit-learn 0.23; the stdlib
# io.StringIO is a drop-in replacement here.
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
In [160]:
import os
os.getcwd()
Out[160]:
'C:\\Users\\amaresh.murthiraju\\Python\\Pyhton - Machine Learning - Innomatics - 2019\\09 Decision Tree'
In [161]:
# Create DOT data

#export_graphviz(DT_model2, out_file='tree_bw.dot', 
                                #feature_names=feature_cols2)

dot_data = export_graphviz(DT_model2, out_file=None, 
                                feature_names=feature_cols)

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)  

# Show graph
Image(graph.create_png())
Out[161]:
In [162]:
# sklearn.externals.six was removed in scikit-learn 0.23; use stdlib io.
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

import collections

# Create DOT data (in-memory; pass a file name to out_file to save it).
dot_data = export_graphviz(DT_model2, out_file=None, 
                                feature_names=feature_cols,filled=True,\
                           rounded=True, class_names=['No','Yes'], )

# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)  

# Fill colors for the two children of each split (left branch, right branch).
colors = ('yellow', 'grey')

edges = collections.defaultdict(list)

# Collect each node's outgoing edges, keyed by source node id.
for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

# Every split node in a sklearn decision tree has exactly two children;
# sort them so the color assignment is deterministic.
for edge in edges:
    edges[edge].sort()    
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        dest.set_fillcolor(colors[i])
# Show graph inline and also write it to disk.
Image(graph.create_png())
graph.write_png('tree.png')
Out[162]:
Out[162]:
True
In [163]:
# Same tree rendered via a StringIO buffer, with special_characters=True
# for nicer node labels; writes tree.png again (overwrites the file saved
# by the previous cell).
dot_data = StringIO()  
export_graphviz(DT_model2, out_file=dot_data,  
                         feature_names=feature_cols,  
                         class_names=['No','Yes'],  
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())
graph.write_png('tree.png')
Out[163]:
Out[163]:
True
In [ ]:
 
In [ ]: